R Tutorial
An introduction to R
Introduction
This tutorial is will introduce the reader to
,
a free, open-source statistical computing environment often used with
RStudio, a integrated development environment for
.
R Project Logo
Download
Download at https://www.r-project.org/
Download RStudio at https://rstudio.com/products/rstudio/download/
Calculator
can be used as a super awesome calculator
# 5 + 3 = 8
5 + 3 ## [1] 8
# 24 / (1 + 2) = 8
24 / (1 + 2) ## [1] 8
# 2 * 2 * 2 = 8
2^3 ## [1] 8
# 8 * 8 = 64
sqrt(64) ## [1] 8
# -log10(0.05 / 5000000) = 8
-log10(0.05 / 5000000) ## [1] 8
Functions
has many useful built in functions
1:10## [1] 1 2 3 4 5 6 7 8 9 10
as.character(1:10)## [1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10"
rep(1:2, times = 5)## [1] 1 2 1 2 1 2 1 2 1 2
rep(1:5, times = 2)## [1] 1 2 3 4 5 1 2 3 4 5
rep(1:5, each = 2)## [1] 1 1 2 2 3 3 4 4 5 5
rep(1:5, length.out = 7)## [1] 1 2 3 4 5 1 2
seq(5, 50, by = 5)## [1] 5 10 15 20 25 30 35 40 45 50
seq(5, 50, length.out = 5)## [1] 5.00 16.25 27.50 38.75 50.00
paste(1:10, 20:30, sep = "-")## [1] "1-20" "2-21" "3-22" "4-23" "5-24" "6-25" "7-26" "8-27" "9-28" "10-29" "1-30"
paste(1:10, collapse = "-")## [1] "1-2-3-4-5-6-7-8-9-10"
paste0("x", 1:10)## [1] "x1" "x2" "x3" "x4" "x5" "x6" "x7" "x8" "x9" "x10"
min(1:10)## [1] 1
max(1:10)## [1] 10
range(1:10)## [1] 1 10
mean(1:10)## [1] 5.5
sd(1:10)## [1] 3.02765
Custom Functions
Users can also create their own functions
customFunction1 <- function(x, y) {
z <- 100 * x / (x + y)
paste(z, "%")
}
customFunction1(x = 10, y = 90)## [1] "10 %"
customFunction2 <- function(x) {
mymin <- mean(x - sd(x))
mymax <- mean(x) + sd(x)
print(paste("Min =", mymin))
print(paste("Max =", mymax))
}
customFunction2(x = 1:10)## [1] "Min = 2.47234964590251"
## [1] "Max = 8.52765035409749"
for loops and if else
statements
xx <- NULL #creates and empty object
for(i in 1:10) {
xx[i] <- i*3
}
xx## [1] 3 6 9 12 15 18 21 24 27 30
xx %% 2 #gives the remainder when divided by 2## [1] 1 0 1 0 1 0 1 0 1 0
for(i in 1:length(xx)) {
if((xx[i] %% 2) == 0) {
print(paste(xx[i],"is Even"))
} else {
print(paste(xx[i],"is Odd"))
}
}## [1] "3 is Odd"
## [1] "6 is Even"
## [1] "9 is Odd"
## [1] "12 is Even"
## [1] "15 is Odd"
## [1] "18 is Even"
## [1] "21 is Odd"
## [1] "24 is Even"
## [1] "27 is Odd"
## [1] "30 is Even"
# or
ifelse(xx %% 2 == 0, "Even", "Odd")## [1] "Odd" "Even" "Odd" "Even" "Odd" "Even" "Odd" "Even" "Odd" "Even"
paste(xx, ifelse(xx %% 2 == 0, "is Even", "is Odd"))## [1] "3 is Odd" "6 is Even" "9 is Odd" "12 is Even" "15 is Odd" "18 is Even" "21 is Odd" "24 is Even" "27 is Odd" "30 is Even"
Objects
Information can be stored in user defined objects, in multiple forms:
c(): a string of valuesmatrix(): a two dimensional matrix in one formatdata.frame(): a two dimensional matrix where each column can be a different formatlist():
A string…
xc <- 1:10
xc## [1] 1 2 3 4 5 6 7 8 9 10
xc <- c(1,2,3,4,5,6,7,8,9,10)
xc## [1] 1 2 3 4 5 6 7 8 9 10
A matrix…
xm <- matrix(1:100, nrow = 10, ncol = 10, byrow = T)
xm## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## [1,] 1 2 3 4 5 6 7 8 9 10
## [2,] 11 12 13 14 15 16 17 18 19 20
## [3,] 21 22 23 24 25 26 27 28 29 30
## [4,] 31 32 33 34 35 36 37 38 39 40
## [5,] 41 42 43 44 45 46 47 48 49 50
## [6,] 51 52 53 54 55 56 57 58 59 60
## [7,] 61 62 63 64 65 66 67 68 69 70
## [8,] 71 72 73 74 75 76 77 78 79 80
## [9,] 81 82 83 84 85 86 87 88 89 90
## [10,] 91 92 93 94 95 96 97 98 99 100
xm <- matrix(1:100, nrow = 10, ncol = 10, byrow = F)
xm## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## [1,] 1 11 21 31 41 51 61 71 81 91
## [2,] 2 12 22 32 42 52 62 72 82 92
## [3,] 3 13 23 33 43 53 63 73 83 93
## [4,] 4 14 24 34 44 54 64 74 84 94
## [5,] 5 15 25 35 45 55 65 75 85 95
## [6,] 6 16 26 36 46 56 66 76 86 96
## [7,] 7 17 27 37 47 57 67 77 87 97
## [8,] 8 18 28 38 48 58 68 78 88 98
## [9,] 9 19 29 39 49 59 69 79 89 99
## [10,] 10 20 30 40 50 60 70 80 90 100
A data frame…
xd <- data.frame(
x1 = c("aa","bb","cc","dd","ee",
"ff","gg","hh","ii","jj"),
x2 = 1:10,
x3 = c(1,1,1,1,1,2,2,2,3,3),
x4 = rep(c(1,2), times = 5),
x5 = rep(1:5, times = 2),
x6 = rep(1:5, each = 2),
x7 = seq(5, 50, by = 5),
x8 = log10(1:10),
x9 = (1:10)^3,
x10 = c(T,T,T,F,F,T,T,F,F,F)
)
xd## x1 x2 x3 x4 x5 x6 x7 x8 x9 x10
## 1 aa 1 1 1 1 1 5 0.0000000 1 TRUE
## 2 bb 2 1 2 2 1 10 0.3010300 8 TRUE
## 3 cc 3 1 1 3 2 15 0.4771213 27 TRUE
## 4 dd 4 1 2 4 2 20 0.6020600 64 FALSE
## 5 ee 5 1 1 5 3 25 0.6989700 125 FALSE
## 6 ff 6 2 2 1 3 30 0.7781513 216 TRUE
## 7 gg 7 2 1 2 4 35 0.8450980 343 TRUE
## 8 hh 8 2 2 3 4 40 0.9030900 512 FALSE
## 9 ii 9 3 1 4 5 45 0.9542425 729 FALSE
## 10 jj 10 3 2 5 5 50 1.0000000 1000 FALSE
A list…
xl <- list(xc, xm, xd)
xl[[1]]## [1] 1 2 3 4 5 6 7 8 9 10
xl[[2]]## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## [1,] 1 11 21 31 41 51 61 71 81 91
## [2,] 2 12 22 32 42 52 62 72 82 92
## [3,] 3 13 23 33 43 53 63 73 83 93
## [4,] 4 14 24 34 44 54 64 74 84 94
## [5,] 5 15 25 35 45 55 65 75 85 95
## [6,] 6 16 26 36 46 56 66 76 86 96
## [7,] 7 17 27 37 47 57 67 77 87 97
## [8,] 8 18 28 38 48 58 68 78 88 98
## [9,] 9 19 29 39 49 59 69 79 89 99
## [10,] 10 20 30 40 50 60 70 80 90 100
xl[[3]]## x1 x2 x3 x4 x5 x6 x7 x8 x9 x10
## 1 aa 1 1 1 1 1 5 0.0000000 1 TRUE
## 2 bb 2 1 2 2 1 10 0.3010300 8 TRUE
## 3 cc 3 1 1 3 2 15 0.4771213 27 TRUE
## 4 dd 4 1 2 4 2 20 0.6020600 64 FALSE
## 5 ee 5 1 1 5 3 25 0.6989700 125 FALSE
## 6 ff 6 2 2 1 3 30 0.7781513 216 TRUE
## 7 gg 7 2 1 2 4 35 0.8450980 343 TRUE
## 8 hh 8 2 2 3 4 40 0.9030900 512 FALSE
## 9 ii 9 3 1 4 5 45 0.9542425 729 FALSE
## 10 jj 10 3 2 5 5 50 1.0000000 1000 FALSE
Selecting Data
xc[5] # 5th element in xc## [1] 5
xd$x3[5] # 5th element in col "x3"## [1] 1
xd[5,"x3"] # row 5, col "x3"## [1] 1
xd$x3 # all of col "x3"## [1] 1 1 1 1 1 2 2 2 3 3
xd[,"x3"] # all rows, col "x3"## [1] 1 1 1 1 1 2 2 2 3 3
xd[3,] # row 3, all cols## x1 x2 x3 x4 x5 x6 x7 x8 x9 x10
## 3 cc 3 1 1 3 2 15 0.4771213 27 TRUE
xd[c(2,4),c("x4","x5")] # rows 2 & 4, cols "x4" & "x5"## x4 x5
## 2 2 2
## 4 2 4
xl[[3]]$x1 # 3rd object in the list, col "x1## [1] "aa" "bb" "cc" "dd" "ee" "ff" "gg" "hh" "ii" "jj"
regexpr
xx <- data.frame(Name = c("Item 1 (detail 1)",
"Item 20 (detail 20)",
"Item 300 (detail 300)"),
Item = NA,
Detail = NA)
xx$Detail <- substr(xx$Name, regexpr("\\(", xx$Name)+1, regexpr("\\)", xx$Name)-1)
xx$Item <- substr(xx$Name, 1, regexpr("\\(", xx$Name)-2)
xx## Name Item Detail
## 1 Item 1 (detail 1) Item 1 detail 1
## 2 Item 20 (detail 20) Item 20 detail 20
## 3 Item 300 (detail 300) Item 300 detail 300
Data Formats
Data can also be saved in many formats:
- numeric
- integer
- character
- factor
- logical
xd$x3 <- as.character(xd$x3)
xd$x3## [1] "1" "1" "1" "1" "1" "2" "2" "2" "3" "3"
xd$x3 <- as.numeric(xd$x3)
xd$x3## [1] 1 1 1 1 1 2 2 2 3 3
xd$x3 <- as.factor(xd$x3)
xd$x3## [1] 1 1 1 1 1 2 2 2 3 3
## Levels: 1 2 3
xd$x3 <- factor(xd$x3, levels = c("3","2","1"))
xd$x3## [1] 1 1 1 1 1 2 2 2 3 3
## Levels: 3 2 1
xd$x10## [1] TRUE TRUE TRUE FALSE FALSE TRUE TRUE FALSE FALSE FALSE
as.numeric(xd$x10) # TRUE = 1, FALSE = 0## [1] 1 1 1 0 0 1 1 0 0 0
sum(xd$x10)## [1] 5
Internal structure of an object can be checked with
str()
str(xc) # c()## num [1:10] 1 2 3 4 5 6 7 8 9 10
str(xm) # matrix()## int [1:10, 1:10] 1 2 3 4 5 6 7 8 9 10 ...
str(xd) # data.frame()## 'data.frame': 10 obs. of 10 variables:
## $ x1 : chr "aa" "bb" "cc" "dd" ...
## $ x2 : int 1 2 3 4 5 6 7 8 9 10
## $ x3 : Factor w/ 3 levels "3","2","1": 3 3 3 3 3 2 2 2 1 1
## $ x4 : num 1 2 1 2 1 2 1 2 1 2
## $ x5 : int 1 2 3 4 5 1 2 3 4 5
## $ x6 : int 1 1 2 2 3 3 4 4 5 5
## $ x7 : num 5 10 15 20 25 30 35 40 45 50
## $ x8 : num 0 0.301 0.477 0.602 0.699 ...
## $ x9 : num 1 8 27 64 125 216 343 512 729 1000
## $ x10: logi TRUE TRUE TRUE FALSE FALSE TRUE ...
str(xl) # list()## List of 3
## $ : num [1:10] 1 2 3 4 5 6 7 8 9 10
## $ : int [1:10, 1:10] 1 2 3 4 5 6 7 8 9 10 ...
## $ :'data.frame': 10 obs. of 10 variables:
## ..$ x1 : chr [1:10] "aa" "bb" "cc" "dd" ...
## ..$ x2 : int [1:10] 1 2 3 4 5 6 7 8 9 10
## ..$ x3 : num [1:10] 1 1 1 1 1 2 2 2 3 3
## ..$ x4 : num [1:10] 1 2 1 2 1 2 1 2 1 2
## ..$ x5 : int [1:10] 1 2 3 4 5 1 2 3 4 5
## ..$ x6 : int [1:10] 1 1 2 2 3 3 4 4 5 5
## ..$ x7 : num [1:10] 5 10 15 20 25 30 35 40 45 50
## ..$ x8 : num [1:10] 0 0.301 0.477 0.602 0.699 ...
## ..$ x9 : num [1:10] 1 8 27 64 125 216 343 512 729 1000
## ..$ x10: logi [1:10] TRUE TRUE TRUE FALSE FALSE TRUE ...
Packages
Additional libraries can be installed and loaded for use.
install.packages("scales")library(scales)
xx <- data.frame(Values = 1:10)
xx$Rescaled <- rescale(x = xx$Values, to = c(1,30))
xx## Values Rescaled
## 1 1 1.000000
## 2 2 4.222222
## 3 3 7.444444
## 4 4 10.666667
## 5 5 13.888889
## 6 6 17.111111
## 7 7 20.333333
## 8 8 23.555556
## 9 9 26.777778
## 10 10 30.000000
libraries can also be used without having to load them
scales::rescale(1:10, to = c(1,30))## [1] 1.000000 4.222222 7.444444 10.666667 13.888889 17.111111 20.333333 23.555556 26.777778 30.000000
Data Wrangling
R for Data Science - https://r4ds.had.co.nz/
xx <- data.frame(Group = c("X","X","Y","Y","Y","X","X","X","Y","Y"),
Data1 = 1:10,
Data2 = seq(10, 100, by = 10))
xx$NewData1 <- xx$Data1 + xx$Data2
xx$NewData2 <- xx$Data1 * 1000
xx## Group Data1 Data2 NewData1 NewData2
## 1 X 1 10 11 1000
## 2 X 2 20 22 2000
## 3 Y 3 30 33 3000
## 4 Y 4 40 44 4000
## 5 Y 5 50 55 5000
## 6 X 6 60 66 6000
## 7 X 7 70 77 7000
## 8 X 8 80 88 8000
## 9 Y 9 90 99 9000
## 10 Y 10 100 110 10000
xx$Data1 < 5 # which are less than 5## [1] TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
xx[xx$Data1 < 5,]## Group Data1 Data2 NewData1 NewData2
## 1 X 1 10 11 1000
## 2 X 2 20 22 2000
## 3 Y 3 30 33 3000
## 4 Y 4 40 44 4000
xx[xx$Group == "X", c("Group","Data2","NewData1")]## Group Data2 NewData1
## 1 X 10 11
## 2 X 20 22
## 6 X 60 66
## 7 X 70 77
## 8 X 80 88
Data wrangling with tidyverse and pipes
(%>%)
library(tidyverse) # install.packages("tidyverse")
xx <- data.frame(Group = c("X","X","Y","Y","Y","Y","Y","X","X","X")) %>%
mutate(Data1 = 1:10,
Data2 = seq(10, 100, by = 10),
NewData1 = Data1 + Data2,
NewData2 = Data1 * 1000)
xx## Group Data1 Data2 NewData1 NewData2
## 1 X 1 10 11 1000
## 2 X 2 20 22 2000
## 3 Y 3 30 33 3000
## 4 Y 4 40 44 4000
## 5 Y 5 50 55 5000
## 6 Y 6 60 66 6000
## 7 Y 7 70 77 7000
## 8 X 8 80 88 8000
## 9 X 9 90 99 9000
## 10 X 10 100 110 10000
filter(xx, Data1 < 5)## Group Data1 Data2 NewData1 NewData2
## 1 X 1 10 11 1000
## 2 X 2 20 22 2000
## 3 Y 3 30 33 3000
## 4 Y 4 40 44 4000
xx %>% filter(Data1 < 5)## Group Data1 Data2 NewData1 NewData2
## 1 X 1 10 11 1000
## 2 X 2 20 22 2000
## 3 Y 3 30 33 3000
## 4 Y 4 40 44 4000
xx %>% filter(Group == "X") %>%
select(Group, NewColName=Data2, NewData1)## Group NewColName NewData1
## 1 X 10 11
## 2 X 20 22
## 3 X 80 88
## 4 X 90 99
## 5 X 100 110
xs <- xx %>%
group_by(Group) %>%
summarise(Data2_mean = mean(Data2),
Data2_sd = sd(Data2),
NewData2_mean = mean(NewData2),
NewData2_sd = sd(NewData2))
xs## # A tibble: 2 × 5
## Group Data2_mean Data2_sd NewData2_mean NewData2_sd
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 X 60 41.8 6000 4183.
## 2 Y 50 15.8 5000 1581.
xx %>% left_join(xs, by = "Group")## Group Data1 Data2 NewData1 NewData2 Data2_mean Data2_sd NewData2_mean NewData2_sd
## 1 X 1 10 11 1000 60 41.83300 6000 4183.300
## 2 X 2 20 22 2000 60 41.83300 6000 4183.300
## 3 Y 3 30 33 3000 50 15.81139 5000 1581.139
## 4 Y 4 40 44 4000 50 15.81139 5000 1581.139
## 5 Y 5 50 55 5000 50 15.81139 5000 1581.139
## 6 Y 6 60 66 6000 50 15.81139 5000 1581.139
## 7 Y 7 70 77 7000 50 15.81139 5000 1581.139
## 8 X 8 80 88 8000 60 41.83300 6000 4183.300
## 9 X 9 90 99 9000 60 41.83300 6000 4183.300
## 10 X 10 100 110 10000 60 41.83300 6000 4183.300
Read/Write data
xx <- read.csv("data_r_tutorial.csv")
write.csv(xx, "data_r_tutorial.csv", row.names = F)For excel sheets, the package readxl can be used to read
in sheets of data.
library(readxl) # install.packages("readxl")
xx <- read_xlsx("data_r_tutorial.xlsx", sheet = "Data")Tidy Data
Tutorial 1 - https://cran.r-project.org/web/packages/tidyr/vignettes/tidy-data.html
Tutorial 2 - https://r4ds.had.co.nz/tidy-data.html
yy <- xx %>%
group_by(Name, Location) %>%
summarise(Mean_DTF = round(mean(DTF),1)) %>%
arrange(Location)
yy## # A tibble: 9 × 3
## # Groups: Name [3]
## Name Location Mean_DTF
## <chr> <chr> <dbl>
## 1 CDC Maxim AGL Jessore, Bangladesh 86.7
## 2 ILL 618 AGL Jessore, Bangladesh 79.3
## 3 Laird AGL Jessore, Bangladesh 76.8
## 4 CDC Maxim AGL Metaponto, Italy 134.
## 5 ILL 618 AGL Metaponto, Italy 138.
## 6 Laird AGL Metaponto, Italy 137.
## 7 CDC Maxim AGL Saskatoon, Canada 52.5
## 8 ILL 618 AGL Saskatoon, Canada 47
## 9 Laird AGL Saskatoon, Canada 56.8
yy <- yy %>% spread(key = Location, value = Mean_DTF)
yy## # A tibble: 3 × 4
## # Groups: Name [3]
## Name `Jessore, Bangladesh` `Metaponto, Italy` `Saskatoon, Canada`
## <chr> <dbl> <dbl> <dbl>
## 1 CDC Maxim AGL 86.7 134. 52.5
## 2 ILL 618 AGL 79.3 138. 47
## 3 Laird AGL 76.8 137. 56.8
yy <- yy %>% gather(key = TraitName, value = Value, 2:4)
yy## # A tibble: 9 × 3
## # Groups: Name [3]
## Name TraitName Value
## <chr> <chr> <dbl>
## 1 CDC Maxim AGL Jessore, Bangladesh 86.7
## 2 ILL 618 AGL Jessore, Bangladesh 79.3
## 3 Laird AGL Jessore, Bangladesh 76.8
## 4 CDC Maxim AGL Metaponto, Italy 134.
## 5 ILL 618 AGL Metaponto, Italy 138.
## 6 Laird AGL Metaponto, Italy 137.
## 7 CDC Maxim AGL Saskatoon, Canada 52.5
## 8 ILL 618 AGL Saskatoon, Canada 47
## 9 Laird AGL Saskatoon, Canada 56.8
yy <- yy %>% spread(key = Name, value = Value)
yy## # A tibble: 3 × 4
## TraitName `CDC Maxim AGL` `ILL 618 AGL` `Laird AGL`
## <chr> <dbl> <dbl> <dbl>
## 1 Jessore, Bangladesh 86.7 79.3 76.8
## 2 Metaponto, Italy 134. 138. 137.
## 3 Saskatoon, Canada 52.5 47 56.8
Base Plotting
We will start with some basic plotting using the base function
plot()
Tutorial 1 - http://www.sthda.com/english/wiki/r-base-graphs
Tutorial 2 - https://bookdown.org/rdpeng/exdata/the-base-plotting-system-1.html
# A basic scatter plot
plot(x = xd$x8, y = xd$x9)# Adjust color and shape of the points
plot(x = xd$x8, y = xd$x9, col = "darkred", pch = 0)plot(x = xd$x8, y = xd$x9, col = xd$x4, pch = xd$x4)# Adjust plot type
plot(x = xd$x8, y = xd$x9, type = "line")# Adjust linetype
plot(x = xd$x8, y = xd$x9, type = "line", lty = 2)# Plot lines and points
plot(x = xd$x8, y = xd$x9, type = "both")Now lets create some random and normally distributed data to make some more complicated plots
# 100 random uniformly distributed numbers ranging from 0 - 100
ru <- runif(100, min = 0, max = 100)
ru## [1] 81.6705570 19.9231602 60.3464190 89.4491577 25.3439352 51.1012513 18.5239987 32.8402219 9.0818841 54.1371712 76.9182298 88.0459577 41.8741893
## [14] 19.2536184 6.9636590 49.3995100 36.5875466 67.6282814 58.1118402 54.0691127 33.6665179 21.2524497 67.5091570 18.2982308 20.5095096 80.5118839
## [27] 1.9522965 60.3208120 38.7041595 27.1149748 33.4662541 43.0078849 14.0819566 11.5089501 7.5106913 45.1768260 5.9441768 83.7174287 60.1740185
## [40] 56.0415684 8.1122882 22.7445374 23.7648824 65.4152959 20.8335726 30.7615296 25.0544678 1.7719360 41.6499054 47.2536847 39.1425638 50.1155437
## [53] 84.6957471 72.3730593 29.0792980 78.1721406 27.3433120 1.5521618 9.0052998 30.6846834 56.0114780 80.2791856 53.2666470 78.0121799 9.6723048
## [66] 88.4198642 32.2304213 43.5348061 93.9175251 46.6292494 24.2213152 43.8920871 65.2250337 25.0327242 42.3137431 67.2998458 31.1864458 2.3712701
## [79] 26.9230349 64.4311183 42.0716143 66.1847393 94.4239976 30.3322457 0.2559529 60.7658685 68.3711933 44.3043188 19.7536134 71.7792889 2.6663581
## [92] 47.0049635 54.7205217 16.3287765 26.2880661 95.1362737 24.3824165 93.6017533 8.2493820 52.9011194
plot(x = ru)order(ru)## [1] 85 58 48 27 78 91 37 15 35 41 99 59 9 65 34 33 94 24 7 14 89 2 25 45 22 42 43 71 97 74 47 5 95 79 30 57
## [37] 55 84 60 46 77 67 8 31 21 17 29 51 49 13 81 75 32 68 72 88 36 70 92 50 16 52 6 100 63 20 10 93 61 40 19 39
## [73] 28 3 86 80 73 44 82 76 23 18 87 90 54 11 64 56 62 26 1 38 53 12 66 4 98 69 83 96
ru<- ru[order(ru)]
ru## [1] 0.2559529 1.5521618 1.7719360 1.9522965 2.3712701 2.6663581 5.9441768 6.9636590 7.5106913 8.1122882 8.2493820 9.0052998 9.0818841
## [14] 9.6723048 11.5089501 14.0819566 16.3287765 18.2982308 18.5239987 19.2536184 19.7536134 19.9231602 20.5095096 20.8335726 21.2524497 22.7445374
## [27] 23.7648824 24.2213152 24.3824165 25.0327242 25.0544678 25.3439352 26.2880661 26.9230349 27.1149748 27.3433120 29.0792980 30.3322457 30.6846834
## [40] 30.7615296 31.1864458 32.2304213 32.8402219 33.4662541 33.6665179 36.5875466 38.7041595 39.1425638 41.6499054 41.8741893 42.0716143 42.3137431
## [53] 43.0078849 43.5348061 43.8920871 44.3043188 45.1768260 46.6292494 47.0049635 47.2536847 49.3995100 50.1155437 51.1012513 52.9011194 53.2666470
## [66] 54.0691127 54.1371712 54.7205217 56.0114780 56.0415684 58.1118402 60.1740185 60.3208120 60.3464190 60.7658685 64.4311183 65.2250337 65.4152959
## [79] 66.1847393 67.2998458 67.5091570 67.6282814 68.3711933 71.7792889 72.3730593 76.9182298 78.0121799 78.1721406 80.2791856 80.5118839 81.6705570
## [92] 83.7174287 84.6957471 88.0459577 88.4198642 89.4491577 93.6017533 93.9175251 94.4239976 95.1362737
plot(x = ru)# 100 normally distributed numbers with a mean of 50 and sd of 10
nd <- rnorm(100, mean = 50, sd = 10)
nd## [1] 54.73655 53.82082 39.88419 41.64754 47.98965 57.06954 40.96936 49.84004 55.12125 42.44943 46.70932 49.78026 53.76313 55.81527 43.83744 65.02574
## [17] 42.96959 56.92141 43.85815 46.30687 50.65623 49.84521 66.83908 57.30841 63.05539 59.87108 51.90975 62.23545 45.92669 56.09802 48.58157 63.13106
## [33] 40.76241 45.53545 57.79781 39.78468 39.03552 53.85473 45.55016 59.98798 55.85835 33.21154 34.82597 32.71197 64.87241 42.13625 72.55922 46.91533
## [49] 42.68447 49.96743 38.49866 67.77403 66.71672 38.36470 43.84188 56.43587 60.51253 57.98134 43.83155 63.27473 35.18212 51.33460 49.03777 57.42968
## [65] 64.41157 61.50596 47.15226 67.78564 40.20886 72.04335 52.41359 50.16624 55.07456 52.82352 49.97742 42.65028 40.30640 54.15634 50.91501 58.11665
## [81] 48.14781 37.44628 48.82006 56.58637 59.99336 38.31934 45.39068 39.03479 58.14908 52.16667 58.75832 39.47817 63.97837 59.11677 55.22309 41.35069
## [97] 58.96616 40.53581 78.48044 52.55506
nd <- nd[order(nd)]
nd## [1] 32.71197 33.21154 34.82597 35.18212 37.44628 38.31934 38.36470 38.49866 39.03479 39.03552 39.47817 39.78468 39.88419 40.20886 40.30640 40.53581
## [17] 40.76241 40.96936 41.35069 41.64754 42.13625 42.44943 42.65028 42.68447 42.96959 43.83155 43.83744 43.84188 43.85815 45.39068 45.53545 45.55016
## [33] 45.92669 46.30687 46.70932 46.91533 47.15226 47.98965 48.14781 48.58157 48.82006 49.03777 49.78026 49.84004 49.84521 49.96743 49.97742 50.16624
## [49] 50.65623 50.91501 51.33460 51.90975 52.16667 52.41359 52.55506 52.82352 53.76313 53.82082 53.85473 54.15634 54.73655 55.07456 55.12125 55.22309
## [65] 55.81527 55.85835 56.09802 56.43587 56.58637 56.92141 57.06954 57.30841 57.42968 57.79781 57.98134 58.11665 58.14908 58.75832 58.96616 59.11677
## [81] 59.87108 59.98798 59.99336 60.51253 61.50596 62.23545 63.05539 63.13106 63.27473 63.97837 64.41157 64.87241 65.02574 66.71672 66.83908 67.77403
## [97] 67.78564 72.04335 72.55922 78.48044
plot(x = nd)hist(x = nd)hist(nd, breaks = 20, col = "darkgreen")plot(x = density(nd))boxplot(x = nd)boxplot(x = nd, horizontal = T)ggplot2
Lets be honest, the base plots are ugly! The ggplot2
package gives the user to create a better, more visually appealing
plots. Additional packages such as ggbeeswarm and
ggrepel also contain useful functions to add to the
functionality of ggplot2.
ggplot2 - https://ggplot2.tidyverse.org/
Tutorial 1 - http://r-statistics.co/ggplot2-Tutorial-With-R.html
Tutorial 2 - https://www.statsandr.com/blog/graphics-in-r-with-ggplot2/
The R Graph Gallery - https://www.r-graph-gallery.com/ggplot2-package.html
library(ggplot2)
mp <- ggplot(xd, aes(x = x8, y = x9))
mp + geom_point()mp + geom_point(aes(color = x3, shape = x3), size = 4)mp + geom_line(size = 2)mp + geom_line(aes(color = x3), size = 2)mp + geom_smooth(method = "loess")mp + geom_smooth(method = "lm")xx <- data.frame(data = c(rnorm(50, mean = 40, sd = 10),
rnorm(50, mean = 60, sd = 5)),
group = factor(rep(1:2, each = 50)),
label = c("Label1", rep(NA, 49), "Label2", rep(NA, 49)))
mp <- ggplot(xx, aes(x = data, fill = group))
mp + geom_histogram(color = "black")mp + geom_histogram(color = "black", position = "dodge")mp1 <- mp + geom_histogram(color = "black") + facet_grid(group~.)
mp1mp + geom_density(alpha = 0.5)mp <- ggplot(xx, aes(x = group, y = data, fill = group))
mp + geom_boxplot(color = "black")mp + geom_boxplot() + geom_point()mp + geom_violin() + geom_boxplot(width = 0.1, fill = "white")library(ggbeeswarm)
mp + geom_quasirandom()mp + geom_quasirandom(aes(shape = group))mp2 <- mp + geom_violin() +
geom_boxplot(width = 0.1, fill = "white") +
geom_beeswarm(alpha = 0.5)
library(ggrepel)
mp2 + geom_text_repel(aes(label = label), nudge_x = 0.4)library(ggpubr)
ggarrange(mp1, mp2, ncol = 2, widths = c(2,1),
common.legend = T, legend = "bottom")Statistics
Handbook of Biological Statistics - http://biostathandbook.com/
R Companion for ^ - https://rcompanion.org/rcompanion/a_02.html
# Prep data
lev_Loc <- c("Saskatoon, Canada", "Jessore, Bangladesh", "Metaponto, Italy")
lev_Name <- c("ILL 618 AGL", "CDC Maxim AGL", "Laird AGL")
dd <- read_xlsx("data_r_tutorial.xlsx", sheet = "Data") %>%
mutate(Location = factor(Location, levels = lev_Loc),
Name = factor(Name, levels = lev_Name))
xx <- dd %>%
group_by(Name, Location) %>%
summarise(Mean_DTF = mean(DTF))
xx %>% spread(Location, Mean_DTF)## # A tibble: 3 × 4
## # Groups: Name [3]
## Name `Saskatoon, Canada` `Jessore, Bangladesh` `Metaponto, Italy`
## <fct> <dbl> <dbl> <dbl>
## 1 ILL 618 AGL 47 79.3 138.
## 2 CDC Maxim AGL 52.5 86.7 134.
## 3 Laird AGL 56.8 76.8 137.
# Plot
mp1 <- ggplot(dd, aes(x = Location, y = DTF, color = Name, shape = Name)) +
geom_point(size = 2, alpha = 0.7, position = position_dodge(width=0.5))
mp2 <- ggplot(xx, aes(x = Location, y = Mean_DTF,
color = Name, group = Name, shape = Name)) +
geom_point(size = 2.5, alpha = 0.7) +
geom_line(size = 1, alpha = 0.7) +
theme(legend.position = "top")
ggarrange(mp1, mp2, ncol = 2, common.legend = T, legend = "top")From first glace, it is clear there are differences between genotypes, locations, and genotype x environment (GxE) interactions. Now let’s do a few statistical tests.
summary(aov(DTF ~ Name * Location, data = dd))## Df Sum Sq Mean Sq F value Pr(>F)
## Name 2 88 44 3.476 0.0395 *
## Location 2 65863 32931 2598.336 < 2e-16 ***
## Name:Location 4 560 140 11.044 2.52e-06 ***
## Residuals 45 570 13
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
As expected, an ANOVA shows statistical significance for genotype (p-value = 0.0395), Location (p-value < 2e-16) and GxE interactions (p-value < 2.52e-06). However, all this tells us is that one genotype is different from the rest, one location is different from the others and that there is GxE interactions. If we want to be more specific, would need to do some multiple comparison tests.
If we only have two things to compare, we could do a t-test.
xx <- dd %>%
filter(Location %in% c("Saskatoon, Canada", "Jessore, Bangladesh")) %>%
spread(Location, DTF)
t.test(x = xx$`Saskatoon, Canada`, y = xx$`Jessore, Bangladesh`)##
## Welch Two Sample t-test
##
## data: xx$`Saskatoon, Canada` and xx$`Jessore, Bangladesh`
## t = -17.521, df = 32.701, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -32.18265 -25.48402
## sample estimates:
## mean of x mean of y
## 52.11111 80.94444
DTF in Saskatoon, Canada is significantly different (p-value < 2.2e-16) from DTF in Jessore, Bangladesh.
xx <- dd %>%
filter(Name %in% c("ILL 618 AGL", "Laird AGL"),
Location == "Metaponto, Italy") %>%
spread(Name, DTF)
t.test(x = xx$`ILL 618 AGL`, y = xx$`Laird AGL`)##
## Welch Two Sample t-test
##
## data: xx$`ILL 618 AGL` and xx$`Laird AGL`
## t = 0.38008, df = 8.0564, p-value = 0.7137
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -5.059739 7.059739
## sample estimates:
## mean of x mean of y
## 137.8333 136.8333
DTF between ILL 618 AGL and Laird AGL are not significantly different (p-value = 0.7137) in Metaponto, Italy.
pch Plot
xx <- data.frame(x = rep(1:6, times = 5, length.out = 26),
y = rep(5:1, each = 6, length.out = 26),
pch = 0:25)
mp <- ggplot(xx, aes(x = x, y = y, shape = as.factor(pch))) +
geom_point(color = "darkred", fill = "darkblue", size = 5) +
geom_text(aes(label = pch), nudge_x = -0.25) +
scale_shape_manual(values = xx$pch) +
scale_x_continuous(breaks = 6:1) +
scale_y_continuous(breaks = 6:1) +
theme_void() +
theme(legend.position = "none",
plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5),
axis.text = element_blank(),
axis.ticks = element_blank()) +
labs(title = "Plot symbols in R (pch)",
subtitle = "color = \"darkred\", fill = \"darkblue\"",
x = NULL, y = NULL)
ggsave("pch.png", mp, width = 4.5, height = 3, bg = "white")R Markdown
Tutorials on how to create an R markdown document like this one can be found here:
- https://rmarkdown.rstudio.com/articles_intro.html
- https://rmarkdown.rstudio.com/lesson-1.html
- https://alexd106.github.io/intro2R/Rmarkdown_intro.html